suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Raw-data volume for the project.
data_dir <- "/Volumes/Mitsu_NGS_3/METTL2A/"

# Project root. The trailing slash matters: sub-paths below are built by plain
# string concatenation.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)

# Output directories for figures and tables produced by this analysis.
figdir <- str_c(wd, "Figures/DRS_seqcontent/")
tabledir <- str_c(wd, "Tables/DRS/Seq_content/")

# Global ggplot2 theme: classic look, 7 pt base font, legend under the panel.
theme_set(theme_classic(base_size = 7) + theme(legend.position = "bottom"))

Functions

#' Convert per-transcript k-mer counts into percentages
#'
#' Every `<pattern>_num` column is divided by the number of windows of that
#' pattern's length that fit into the transcript
#' (`seq_length - str_length(pattern) + 1`, i.e. `seq_length` for a single
#' base and `seq_length - 1` for a dinucleotide) and multiplied by 100.
#'
#' @param df A data frame with `transcript_id`, `seq_length` and one or more
#'   count columns named `<pattern>_num` (e.g. `C_num`, `CC_num`). A `seq`
#'   column is dropped if present; columns that do not end in `_num` are not
#'   treated as counts.
#' @return A tibble with `transcript_id`, `seq_length` and one
#'   `<pattern>_percent` column per count column. The percentage is `NA` when
#'   the transcript is shorter than the pattern (no window fits).
calc_base_percentage <- function(df) {

  df |>
    # The raw sequence is not needed here; tolerate inputs that lack it.
    select(-any_of("seq")) |>
    pivot_longer(
      # Only the count columns are reshaped, so unrelated columns cannot be
      # mistaken for counts.
      cols = ends_with("_num"),
      names_to = "pattern", values_to = "num",
      names_pattern = "(.*)_num"
    ) |>
    mutate(
      # maximum number == seq_length (if single base (i.e. A, C, G, T))
      # maximum number == seq_length - 1 (if two bases (i.e. CC))
      n_windows = seq_length - str_length(pattern) + 1,
      # Guard against zero/negative denominators for very short sequences.
      percent = if_else(n_windows > 0, 100 * num / n_windows, NA_real_)
    ) |>
    pivot_wider(
      id_cols = c(transcript_id, seq_length),
      names_from = pattern, values_from = percent,
      names_glue = "{pattern}_percent"
    )

}

Read data

# Load the Espresso / DESeq2 transcript table (tab-separated) from the
# project's Tables directory.
espresso_quantification <-
  paste0(
    wd,
    "Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv"
  ) |>
  read_tsv()
## Rows: 36717 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): transcript_id, transcript_type, transcript_name, gene_id, gene_typ...
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_quantification
## # A tibble: 36,717 × 29
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 23 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Transcripts that have at least one row in the intensity-up position table
# whose reference k-mer matches '..C..' (a C with at least two bases on either
# side). Each such transcript appears once, flagged is_methylated = TRUE.
# NOTE(review): presumably ref_kmer is a 5-mer, so this means "central C" --
# confirm against the table.
m3C_transcripts <-
  paste0(
    wd,
    "Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-22.tsv.gz"
  ) |>
  read_tsv() |>
  filter(str_detect(ref_kmer, "..C..")) |>
  distinct(transcript_id) |>
  mutate(is_methylated = TRUE)
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
m3C_transcripts 
## # A tibble: 71 × 2
##    transcript_id     is_methylated
##    <chr>             <lgl>        
##  1 ENST00000429711.7 TRUE         
##  2 ENST00000647248.2 TRUE         
##  3 ENST00000389680.2 TRUE         
##  4 ENST00000361390.2 TRUE         
##  5 ENST00000361453.3 TRUE         
##  6 ENST00000387347.2 TRUE         
##  7 ENST00000361624.2 TRUE         
##  8 ENST00000361739.1 TRUE         
##  9 ENST00000361899.2 TRUE         
## 10 ENST00000361227.2 TRUE         
## # ℹ 61 more rows
# Every transcript present in the intensity-up position table, regardless of
# its reference k-mer (the '..C..' filter is deliberately not applied here).
# Each transcript appears once, flagged is_intensityup = TRUE.
intensityup_transcripts <-
  paste0(
    wd,
    "Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-22.tsv.gz"
  ) |>
  read_tsv() |>
  distinct(transcript_id) |>
  mutate(is_intensityup = TRUE)
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
intensityup_transcripts
## # A tibble: 85 × 2
##    transcript_id     is_intensityup
##    <chr>             <lgl>         
##  1 ENST00000429711.7 TRUE          
##  2 ENST00000647248.2 TRUE          
##  3 ENST00000389680.2 TRUE          
##  4 ENST00000361390.2 TRUE          
##  5 ENST00000361453.3 TRUE          
##  6 ENST00000387347.2 TRUE          
##  7 ENST00000361624.2 TRUE          
##  8 ENST00000361739.1 TRUE          
##  9 ENST00000361899.2 TRUE          
## 10 ENST00000361227.2 TRUE          
## # ℹ 75 more rows
# Transcript sequences. The file has no header row, so the three columns are
# named explicitly.
transcript_seqs <-
  "/Volumes/Mitsu_NGS_2/METTL2A/Database/Custom/Espresso_AsPC1/Espresso_AsPC1.transcripts.tsv" |>
  read_tsv(col_names = c("transcript_id", "seq", "seq_length"))
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, seq
## dbl (1): seq_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
transcript_seqs
## # A tibble: 36,717 × 3
##    transcript_id      seq                                             seq_length
##    <chr>              <chr>                                                <dbl>
##  1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGG…        987
##  2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGG…       2252
##  3 ENST00000420393.5  CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGGCAGAGTT…        854
##  4 ENST00000698415.1  GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTCTGCTAGC…       6597
##  5 ENST00000698416.1  CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTCACACACT…       5500
##  6 ENST00000488263.5  AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTTATCTTTC…       4528
##  7 ENST00000424814.5  GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCTGCCGCT…       2038
##  8 ENST00000231948.9  AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCACAACATG…       2187
##  9 ENST00000432408.6  GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAGATCAGCA…       2203
## 10 ENST00000459840.5  ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTGATTAAAT…        723
## # ℹ 36,707 more rows

Calculate the number of each base (A, C, G, T) and C-initial dinucleotide (CC, CA, CG, CT) in each transcript

# Count mono- and dinucleotide occurrences in every transcript sequence.
#
# Percentages are later normalised by the number of overlapping windows
# (seq_length - 1 for a dinucleotide), so the counts must be overlapping too.
# str_count() with a plain pattern consumes each match and counts only
# non-overlapping hits ('CCC' would yield 1 'CC' instead of 2). The zero-width
# lookahead '(?=CC)' counts every start position instead. CA, CG and CT
# cannot overlap themselves, so plain patterns are already exact for them.
#
# NOTE: this changes CC_num (and CC_percent downstream) relative to results
# produced with the non-overlapping count; regenerate dependent outputs.
num_CC <-
  transcript_seqs |>
  mutate(
    A_num = str_count(seq, "A"),
    C_num = str_count(seq, "C"),
    G_num = str_count(seq, "G"),
    T_num = str_count(seq, "T"),
    CC_num = str_count(seq, "(?=CC)"),
    CA_num = str_count(seq, "CA"),
    CG_num = str_count(seq, "CG"),
    CT_num = str_count(seq, "CT")
  )
num_CC
## # A tibble: 36,717 × 11
##    transcript_id   seq   seq_length A_num C_num G_num T_num CC_num CA_num CG_num
##    <chr>           <chr>      <dbl> <int> <int> <int> <int>  <int>  <int>  <int>
##  1 ENST0000033943… AGCC…        987   283   182   224   298     35     60     17
##  2 ENST0000025160… AGCC…       2252   741   353   485   673     60    131     23
##  3 ENST0000042039… CAGC…        854   192   215   244   203     53     53     45
##  4 ENST0000069841… GATG…       6597  2120  1105  1260  2112    179    447     34
##  5 ENST0000069841… CATG…       5500  1818   893   999  1790    143    369     26
##  6 ENST0000048826… AGGA…       4528  1426   736   885  1481    126    271     26
##  7 ENST0000042481… GAGA…       2038   691   368   395   584     73    138     21
##  8 ENST0000023194… AGAC…       2187   726   400   435   626     80    147     25
##  9 ENST0000043240… GCCT…       2203   728   405   439   631     82    147     26
## 10 ENST0000045984… ATGG…        723   230   132   150   211     26     42      9
## # ℹ 36,707 more rows
## # ℹ 1 more variable: CT_num <int>
# Turn the raw counts into per-transcript percentages.
percent_bases <- calc_base_percentage(num_CC)
percent_bases
## # A tibble: 36,717 × 10
##    transcript_id   seq_length A_percent C_percent G_percent T_percent CC_percent
##    <chr>                <dbl>     <dbl>     <dbl>     <dbl>     <dbl>      <dbl>
##  1 ENST0000033943…        987      28.7      18.4      22.7      30.2       3.55
##  2 ENST0000025160…       2252      32.9      15.7      21.5      29.9       2.67
##  3 ENST0000042039…        854      22.5      25.2      28.6      23.8       6.21
##  4 ENST0000069841…       6597      32.1      16.8      19.1      32.0       2.71
##  5 ENST0000069841…       5500      33.1      16.2      18.2      32.5       2.60
##  6 ENST0000048826…       4528      31.5      16.3      19.5      32.7       2.78
##  7 ENST0000042481…       2038      33.9      18.1      19.4      28.7       3.58
##  8 ENST0000023194…       2187      33.2      18.3      19.9      28.6       3.66
##  9 ENST0000043240…       2203      33.0      18.4      19.9      28.6       3.72
## 10 ENST0000045984…        723      31.8      18.3      20.7      29.2       3.60
## # ℹ 36,707 more rows
## # ℹ 3 more variables: CA_percent <dbl>, CG_percent <dbl>, CT_percent <dbl>

Join information

# Attach sequence-content percentages and the two transcript flags to the
# quantification table. The tables are left-joined one after another onto the
# quantification table (natural joins on the shared column), so every
# quantified transcript is kept. Transcripts absent from a flag table get
# FALSE rather than NA.
espresso_quantification_numCC <-
  list(
    espresso_quantification,
    percent_bases,
    m3C_transcripts,
    intensityup_transcripts
  ) |>
  purrr::reduce(left_join) |>
  replace_na(list(is_methylated = FALSE, is_intensityup = FALSE))
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
espresso_quantification_numCC
## # A tibble: 36,717 × 40
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 34 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# List all columns of the joined table.
names(espresso_quantification_numCC)
##  [1] "transcript_id"              "transcript_type"           
##  [3] "transcript_name"            "gene_id"                   
##  [5] "gene_type"                  "gene_name"                 
##  [7] "siMETTL2A_baseMean"         "siMETTL2A_log2FoldChange"  
##  [9] "siMETTL2A_lfcSE"            "siMETTL2A_stat"            
## [11] "siMETTL2A_pvalue"           "siMETTL2A_padj"            
## [13] "siMETTL2A_I_baseMean"       "siMETTL2A_I_log2FoldChange"
## [15] "siMETTL2A_I_lfcSE"          "siMETTL2A_I_stat"          
## [17] "siMETTL2A_I_pvalue"         "siMETTL2A_I_padj"          
## [19] "siMETTL2A_G_baseMean"       "siMETTL2A_G_log2FoldChange"
## [21] "siMETTL2A_G_lfcSE"          "siMETTL2A_G_stat"          
## [23] "siMETTL2A_G_pvalue"         "siMETTL2A_G_padj"          
## [25] "seqname"                    "genetype2"                 
## [27] "isUp"                       "isDown"                    
## [29] "common_DETs"                "seq_length"                
## [31] "A_percent"                  "C_percent"                 
## [33] "G_percent"                  "T_percent"                 
## [35] "CC_percent"                 "CA_percent"                
## [37] "CG_percent"                 "CT_percent"                
## [39] "is_methylated"              "is_intensityup"
# Write the joined table to the table directory.
# NOTE(review): the recorded output suggests export_tsv() derives the file
# name from the object's name plus the date -- keep passing the object by name.
export_tsv(espresso_quantification_numCC, outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS/Seq_content/espresso_quantification_numCC_2024-07-29.tsv
## # A tibble: 36,717 × 40
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 34 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …

Plot

Sina plot (methylation)

C

# Sina plot of per-transcript C content, split by the methylation flag.
# TRUE is placed first in the factor levels; coord_flip() turns the groups
# into horizontal strips.
percent_C_methylation_sina <-
  ggplot(
    espresso_quantification_numCC,
    aes(
      x = factor(is_methylated, levels = c("TRUE", "FALSE")),
      y = C_percent
    )
  ) +
  ggforce::geom_sina(colour = "gray", size = .2) +
  # coef = Inf stretches the whiskers to the extremes (no outlier points).
  geom_boxplot(width = .1, coef = Inf, lwd = .2) +
  coord_flip()
ggsave_multiple_formats(
  percent_C_methylation_sina,
  width = 5, height = 3, outdir = figdir, units = "cm"
)

# Wilcoxon test (rstatix defaults) of C content between the two groups.
rstatix::wilcox_test(espresso_quantification_numCC, C_percent ~ is_methylated)
## # A tibble: 1 × 7
##   .y.       group1 group2    n1    n2 statistic      p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl>
## 1 C_percent FALSE  TRUE   36646    71  1077184. 0.0122

CC

# Sina plot of per-transcript CC dinucleotide content, split by the
# methylation flag (TRUE first; groups drawn horizontally via coord_flip()).
percent_CC_methylation_sina <-
  ggplot(
    espresso_quantification_numCC,
    aes(
      x = factor(is_methylated, levels = c("TRUE", "FALSE")),
      y = CC_percent
    )
  ) +
  ggforce::geom_sina(colour = "gray", size = .2) +
  # coef = Inf stretches the whiskers to the extremes (no outlier points).
  geom_boxplot(width = .1, coef = Inf, lwd = .2) +
  coord_flip()
ggsave_multiple_formats(
  percent_CC_methylation_sina,
  width = 5, height = 3, outdir = figdir, units = "cm"
)

# Wilcoxon test (rstatix defaults) of CC content between the two groups.
rstatix::wilcox_test(espresso_quantification_numCC, CC_percent ~ is_methylated)
## # A tibble: 1 × 7
##   .y.        group1 group2    n1    n2 statistic     p
## * <chr>      <chr>  <chr>  <int> <int>     <dbl> <dbl>
## 1 CC_percent FALSE  TRUE   36646    71   1079413 0.013

ECDF (methylation)

C

# Empirical CDF of C content, one curve per methylation flag value
# (FALSE = gray, TRUE = green).
percent_C_methylation_ecdf <-
  ggplot(
    espresso_quantification_numCC,
    aes(x = C_percent, colour = is_methylated)
  ) +
  stat_ecdf() +
  scale_colour_manual(values = c("gray", "#1A8F3B"))
ggsave_multiple_formats(
  percent_C_methylation_ecdf,
  width = 4, height = 3.5, outdir = figdir, units = "cm"
)

CC

# Empirical CDF of CC dinucleotide content, one curve per methylation flag
# value (FALSE = gray, TRUE = green).
percent_CC_methylation_ecdf <-
  ggplot(
    espresso_quantification_numCC,
    aes(x = CC_percent, colour = is_methylated)
  ) +
  stat_ecdf() +
  scale_colour_manual(values = c("gray", "#1A8F3B"))
ggsave_multiple_formats(
  percent_CC_methylation_ecdf,
  width = 4, height = 3.5, outdir = figdir, units = "cm"
)

# Wilcoxon test (rstatix defaults) of CC content between the two groups.
rstatix::wilcox_test(espresso_quantification_numCC, CC_percent ~ is_methylated)
## # A tibble: 1 × 7
##   .y.        group1 group2    n1    n2 statistic     p
## * <chr>      <chr>  <chr>  <int> <int>     <dbl> <dbl>
## 1 CC_percent FALSE  TRUE   36646    71   1079413 0.013

CA

# Empirical CDF of CA dinucleotide content by methylation flag
# (FALSE = gray, TRUE = green). The plot is only printed: it is neither
# stored in a variable nor saved to disk.
ggplot(
  espresso_quantification_numCC,
  aes(x = CA_percent, colour = is_methylated)
) +
  stat_ecdf() +
  scale_colour_manual(values = c("gray", "#1A8F3B"))

# Wilcoxon test (rstatix defaults) of CA content between the two groups.
rstatix::wilcox_test(espresso_quantification_numCC, CA_percent ~ is_methylated)
## # A tibble: 1 × 7
##   .y.        group1 group2    n1    n2 statistic     p
## * <chr>      <chr>  <chr>  <int> <int>     <dbl> <dbl>
## 1 CA_percent FALSE  TRUE   36646    71   1186151 0.198

Violin (common DETs + methylation)

CC, common DETs + methylation

# CC content per common-DET class, one facet per methylation flag value.
# Violins show the distribution; the narrow white boxplot (outliers hidden)
# marks median and quartiles.
ggplot(
  espresso_quantification_numCC,
  aes(x = common_DETs, y = CC_percent, fill = common_DETs)
) +
  geom_violin() +
  geom_boxplot(width = .1, outlier.colour = NA, fill = "white") +
  scale_fill_manual(values = c("blue", "gray", "red")) +
  facet_wrap(~ is_methylated)
## Warning: Groups with fewer than two data points have been dropped.

# Pairwise Wilcoxon tests of CC content between common-DET classes, run
# separately within each methylation flag value.
rstatix::wilcox_test(
  group_by(espresso_quantification_numCC, is_methylated),
  CC_percent ~ common_DETs
)
## # A tibble: 6 × 10
##   is_methylated .y.        group1 group2    n1    n2 statistic         p   p.adj
## * <lgl>         <chr>      <chr>  <chr>  <int> <int>     <dbl>     <dbl>   <dbl>
## 1 FALSE         CC_percent down   other    539 35699 10633184    2.68e-5 5.36e-5
## 2 FALSE         CC_percent down   up       539   408   132024.   1.19e-7 3.57e-7
## 3 FALSE         CC_percent other  up     35699   408  8104528    8.63e-5 8.63e-5
## 4 TRUE          CC_percent down   other      1    40       12    6.34e-1 1   e+0
## 5 TRUE          CC_percent down   up         1    30        8    5.81e-1 1   e+0
## 6 TRUE          CC_percent other  up        40    30      541    4.9 e-1 1   e+0
## # ℹ 1 more variable: p.adj.signif <chr>
# CC content per methylation flag value, one facet per common-DET class.
# Violins show the distribution; the narrow white boxplot (outliers hidden)
# marks median and quartiles.
ggplot(
  espresso_quantification_numCC,
  aes(x = is_methylated, y = CC_percent, fill = common_DETs)
) +
  geom_violin() +
  geom_boxplot(width = .1, outlier.colour = NA, fill = "white") +
  scale_fill_manual(values = c("blue", "gray", "red")) +
  facet_wrap(~ common_DETs)
## Warning: Groups with fewer than two data points have been dropped.

# Wilcoxon test of CC content between methylation flag values, run separately
# within each common-DET class.
rstatix::wilcox_test(
  group_by(espresso_quantification_numCC, common_DETs),
  CC_percent ~ is_methylated
)
## # A tibble: 3 × 8
##   common_DETs .y.        group1 group2    n1    n2 statistic       p
## * <chr>       <chr>      <chr>  <chr>  <int> <int>     <dbl>   <dbl>
## 1 down        CC_percent FALSE  TRUE     539     1      341  0.649  
## 2 other       CC_percent FALSE  TRUE   35699    40   606732. 0.1    
## 3 up          CC_percent FALSE  TRUE     408    30     4181  0.00377

ECDF (intensity up)

C

# Empirical CDF of C content, one curve per intensity-up flag value
# (FALSE = gray, TRUE = green).
percent_C_intensityup_ecdf <-
  ggplot(
    espresso_quantification_numCC,
    aes(x = C_percent, colour = is_intensityup)
  ) +
  stat_ecdf() +
  scale_colour_manual(values = c("gray", "#1A8F3B"))
ggsave_multiple_formats(
  percent_C_intensityup_ecdf,
  width = 4, height = 3.5, outdir = figdir, units = "cm"
)

# Wilcoxon test (rstatix defaults) of C content between the two groups.
rstatix::wilcox_test(espresso_quantification_numCC, C_percent ~ is_intensityup)
## # A tibble: 1 × 7
##   .y.       group1 group2    n1    n2 statistic      p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl>
## 1 C_percent FALSE  TRUE   36632    85  1327510. 0.0188

CC

# Empirical CDF of CC dinucleotide content, one curve per intensity-up flag
# value (FALSE = gray, TRUE = green).
percent_CC_intensityup_ecdf <-
  ggplot(
    espresso_quantification_numCC,
    aes(x = CC_percent, colour = is_intensityup)
  ) +
  stat_ecdf() +
  scale_colour_manual(values = c("gray", "#1A8F3B"))
ggsave_multiple_formats(
  percent_CC_intensityup_ecdf,
  width = 4, height = 3.5, outdir = figdir, units = "cm"
)

# Wilcoxon test (rstatix defaults) of CC content between the two groups.
rstatix::wilcox_test(espresso_quantification_numCC, CC_percent ~ is_intensityup)
## # A tibble: 1 × 7
##   .y.        group1 group2    n1    n2 statistic      p
## * <chr>      <chr>  <chr>  <int> <int>     <dbl>  <dbl>
## 1 CC_percent FALSE  TRUE   36632    85  1330268. 0.0203

Violin (common DETs)

CC, common DETs

# CC content per common-DET class across all transcripts. The white boxplot
# uses coef = Inf, so its whiskers reach the extremes and no outlier points
# are drawn.
ggplot(
  espresso_quantification_numCC,
  aes(x = common_DETs, y = CC_percent, fill = common_DETs)
) +
  geom_violin() +
  geom_boxplot(width = .1, coef = Inf, fill = "white") +
  scale_fill_manual(values = c("blue", "gray", "red"))

# Pairwise Wilcoxon tests of CC content between common-DET classes.
rstatix::wilcox_test(espresso_quantification_numCC, CC_percent ~ common_DETs)
## # A tibble: 3 × 9
##   .y.        group1 group2    n1    n2 statistic          p   p.adj p.adj.signif
## * <chr>      <chr>  <chr>  <int> <int>     <dbl>      <dbl>   <dbl> <chr>       
## 1 CC_percent down   other    540 35739 10659050  0.0000292  5.84e-5 ****        
## 2 CC_percent down   up       540   438   139620. 0.00000116 3.48e-6 ****        
## 3 CC_percent other  up     35739   438  8539248. 0.001      1   e-3 **

C, common DETs

# C content per common-DET class across all transcripts. The white boxplot
# uses coef = Inf, so its whiskers reach the extremes and no outlier points
# are drawn.
ggplot(
  espresso_quantification_numCC,
  aes(x = common_DETs, y = C_percent, fill = common_DETs)
) +
  geom_violin() +
  geom_boxplot(width = .1, coef = Inf, fill = "white") +
  scale_fill_manual(values = c("blue", "gray", "red"))

# Pairwise Wilcoxon tests of C content between common-DET classes.
rstatix::wilcox_test(espresso_quantification_numCC, C_percent ~ common_DETs)
## # A tibble: 3 × 9
##   .y.       group1 group2    n1    n2 statistic          p    p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>      <dbl>    <dbl> <chr>       
## 1 C_percent down   other    540 35739 10616316. 0.0000627   1.25e-4 ***         
## 2 C_percent down   up       540   438   138110. 0.00000622  1.87e-5 ****        
## 3 C_percent other  up     35739   438  8477800  0.003       3   e-3 **